knitr::opts_chunk$set(fig.width = 10, fig.height = 8)
require(data.table)
require(magrittr)
require(tidyverse)
require(stringr)
require(ggthemes)
require(ggthemr)
require(GGally)
require(cowplot)
require(corrr)
require(VIM)
require(h2o)
require(h2oEnsemble)
# Initialize h2o
h2o.init()
 Connection successful!

R is connected to the H2O cluster: 
    H2O cluster uptime:         4 days 1 minutes 
    H2O cluster version:        3.10.2.2 
    H2O cluster version age:    23 days  
    H2O cluster name:           H2O_started_from_R_jamesblair_wtp595 
    H2O cluster total nodes:    1 
    H2O cluster total memory:   1.68 GB 
    H2O cluster total cores:    4 
    H2O cluster allowed cores:  2 
    H2O cluster healthy:        TRUE 
    H2O Connection ip:          localhost 
    H2O Connection port:        54321 
    H2O Connection proxy:       NA 
    R Version:                  R version 3.3.2 (2016-10-31) 
home_train <- fread("../data/train.csv")
home_test <- fread("../data/test.csv")  # Missing response - used for Kaggle submission
names(home_train)
 [1] "Id"            "MSSubClass"    "MSZoning"      "LotFrontage"   "LotArea"       "Street"       
 [7] "Alley"         "LotShape"      "LandContour"   "Utilities"     "LotConfig"     "LandSlope"    
[13] "Neighborhood"  "Condition1"    "Condition2"    "BldgType"      "HouseStyle"    "OverallQual"  
[19] "OverallCond"   "YearBuilt"     "YearRemodAdd"  "RoofStyle"     "RoofMatl"      "Exterior1st"  
[25] "Exterior2nd"   "MasVnrType"    "MasVnrArea"    "ExterQual"     "ExterCond"     "Foundation"   
[31] "BsmtQual"      "BsmtCond"      "BsmtExposure"  "BsmtFinType1"  "BsmtFinSF1"    "BsmtFinType2" 
[37] "BsmtFinSF2"    "BsmtUnfSF"     "TotalBsmtSF"   "Heating"       "HeatingQC"     "CentralAir"   
[43] "Electrical"    "1stFlrSF"      "2ndFlrSF"      "LowQualFinSF"  "GrLivArea"     "BsmtFullBath" 
[49] "BsmtHalfBath"  "FullBath"      "HalfBath"      "BedroomAbvGr"  "KitchenAbvGr"  "KitchenQual"  
[55] "TotRmsAbvGrd"  "Functional"    "Fireplaces"    "FireplaceQu"   "GarageType"    "GarageYrBlt"  
[61] "GarageFinish"  "GarageCars"    "GarageArea"    "GarageQual"    "GarageCond"    "PavedDrive"   
[67] "WoodDeckSF"    "OpenPorchSF"   "EnclosedPorch" "3SsnPorch"     "ScreenPorch"   "PoolArea"     
[73] "PoolQC"        "Fence"         "MiscFeature"   "MiscVal"       "MoSold"        "YrSold"       
[79] "SaleType"      "SaleCondition" "SalePrice"    

Introduction

This data set comes from this kaggle competition. The data contains data on home sales in Ames, Iowa, from 2006 - 2010. There are 82 features in the data, including the price of the home when it sold (SalePrice). The object of the Kaggle competition is to accurately predict SalePrice for a predefined test set of homes. We will explore the data to gain understanding and look to expose any underlying patterns before modeling the data and attempting to climb into the top 10% of the Kaggle leaderboard.

Exploratory Data Analysis

Before jumping into looking at the data, a few simple pre processing steps are necessary. Some of the numeric features in the data, like quality scores, are actually categorical in nature. We’ll transform those features into factors and we’ll also identify and separate discrete features from continuous features so they can be examined separately. We’ll also combine the data so that we can make easily make comparisons between the test and the train set to check that feature distributions are approximately identical.

# Combine data sets with distinction between train and test
home_train[,split := "train"]
home_test[,split := "test"]
home_test[,SalePrice := NA]
home_data <- rbind(home_train, home_test)
# Convert features into proper format based on data_description.txt
home_data[,c("MSSubClass",
             "OverallQual",
             "OverallCond") := list(
                as.factor(MSSubClass),
                as.factor(OverallQual),
                as.factor(OverallCond)
              )]
# Number of discrete and continuous features
sapply(home_data, class) %>% 
  table
.
character    factor   integer 
       44         3        35 
# Define dependent variable
y <- "SalePrice"
# Setnames to be more program friendly
setnames(home_data, names(home_data)[str_detect(names(home_data), "^[0-9]")], paste0("h", names(home_data)[str_detect(names(home_data), "^[0-9]")]))
# Extract names of continuous features
cont_features <- names(home_data)[sapply(home_data, class) == "integer"]
cont_features <- setdiff(cont_features, c(y, "Id"))
# Extract names of discrete features
disc_features <- names(home_data)[sapply(home_data, class) %in% c("character", "factor")]
disc_features <- setdiff(disc_features, "split")

Response Variable

Before jumping into examining the features, it is important to understand the response variable, in this case SalePrice. The following histogram illustrates the distribution of SalePrice in the data.

This histogram clearly indicates the data is somewhat right skewed. In an attempt to control for that, we’ll look at what the distribution looks like after a log transformation and compare it with the original distribution in the following plot.

home_data[,log_SalePrice := log(SalePrice)]
# Histogram of log sale price
lsp_hist <- home_data %>% 
  ggplot(aes(x = log_SalePrice)) +
  geom_histogram() +
  theme_minimal() +
  labs(title = "Histogram of log(SalePrice)")
# Side by side comparison of origional distribution with log distribution
plot_grid(sp_hist, lsp_hist)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Removed 1459 rows containing non-finite values (stat_bin).`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Removed 1459 rows containing non-finite values (stat_bin).

The log transformation appeared to help with the skewness of the data so the remainder of the analysis will use log_SalePrice.

Missingness

Now, we want to check for missing values. The VIM package has some nice functions for visualizing missing values and so we’ll use those. Essentially, we want to look for features with a high volumn of missing values and we also want to see if there are prevelant patterns in features that are often missing together. That could indicate a latent variable that isn’t being measured but that we could account for with a dummy variable.

There are quite a few columns dominated by missing values. In addition, there are a couple of combinations of missing values in observations that happen rather frequently, possibly an indication of some latent variable. In order to help machine learning models pick up on these similar observations, we’ll add a missingness feature to the data to encode common missingness combinations.

# Add missingness feature
home_data[,missingness := "other"]
home_data[is.na(Alley) & is.na(PoolQC) & is.na(Fence) & is.na(MiscVal), missingness := "one"]
home_data[is.na(Alley) & is.na(PoolQC) & is.na(Fence) & is.na(MiscVal) & is.na(FireplaceQu), missingness := "two"]

Discrete Features

Now, we will perform a more in depth examination of the data by looking at discrete features first. We’re interested in determining a few things. First, we want to see how many observations there are of each value for each discrete feature. Features that appear to sparse may need to be removed from the data. Second, we want to look at each level of each discrete feature and how it relates to log_SalePrice to determine which features provide strong signals for log_SalePrice. To address the first point, we will cerate bar plots for each discrete feature. To address the second point, we will look at boxplots for each of the same features.

# Count of distinct values in each discrete feature
disc_bar <- home_data[,c(disc_features, "split"), with = FALSE] %>% 
  melt(id.vars = "split", measure.vars = disc_features) %>% 
  ggplot(aes(x = value, fill = split)) +
  geom_bar(na.rm = FALSE, position = "dodge") +
  facet_wrap(~variable, scales = "free") +
  theme_tufte() +
  theme(axis.text.y = element_blank(),
        axis.text.x = element_text(angle = 45)) +
  labs("Discrete Feature Barplots")
# Boxplot of response variable for discrete features
disc_box <- home_data[split == "train",c(disc_features, y2, "split"), with = FALSE] %>% 
  melt(id.vars = c("split", y2), measure.vars = disc_features) %>% 
  ggplot(aes(x = value, y = log_SalePrice)) +
  geom_boxplot() +
  facet_wrap(~variable, scales = "free") +
  theme_tufte() +
  theme(axis.text.y = element_blank(),
        axis.text.x = element_text(angle = 45)) +
  labs("Discrete Feature Boxplots")
plot_grid(disc_bar, disc_box, nrow = 2)

The above plots illustrate some important insights. First, Street, Utlilities, and PoolQC each appear to be rather sparse. However, PoolQC does seem to have a strong influence on log_SalePrice. There are several additional variables that seem to strongly indicate logSalePrice. Sepcifically, OverallQual, OverallCond have rather distinct patterns. To determine of these two variable’s can be further exploited, we’ll look at how they interact with one another. The following heatmaps illustrate the interaction between OverallQual and OverallCond. The heatmap on the left illustrates how frequently various combinations of these features occur together while the heatmap on the right illustrates the

n_heat <- home_data[,.N, by = .(split, OverallQual, OverallCond)] %>% 
  ggplot(aes(x = OverallQual, y = OverallCond, fill = N)) +
  geom_raster() +
  theme_minimal() +
  labs(title = "Heatmap of OverallCond and OverallQual")
lsp_heat <- home_data[split == "train",.(avg_log_SalePrice = mean(log_SalePrice)), by = .(OverallCond, OverallQual)] %>% 
  ggplot(aes(x = OverallQual, y = OverallCond, fill = avg_log_SalePrice)) +
  geom_raster() +
  theme_minimal() +
  labs(title = "Heatmap of OverallCond and OverallQual and log_SalePrice")
plot_grid(n_heat, lsp_heat)

As can be seen from the heatmaps, the majority of homes are sold with OverallQual between 4 and 8 and OverallCond between 5 and 7. This pattern remains consistent across test and train data. As expected, average log_SalePrice increases as both OveralQual and OverallCond increase.

Continuous Features

Now that we have at least some idea of what is happening with the discrete features, we’ll take a look at what is happening with the continuous features. First, we’ll look at a correlation plot of each continuous feature to determine the strength of relationship both among features and between featuers and log_SalePrice.

# Correlation matrix
house_corplot <- home_data[split == "train",c(cont_features, y2), with = FALSE] %>% 
  correlate %>% 
  rearrange() %>% 
  shave() %>% 
  rplot()
  
house_corplot +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

The correlation plot indicates that several features dealing with the size of the home are strongly correlated with log_SalePrice. Many of these features are also correlated with one another. While this would be an issue with general linear models, the statistical methods we will use should handle collinearity well.

# Scatter plot pairs
home_data[split == "train", c(cont_features, y), with = FALSE] %>% 
  ggpairs()

Quick Initial Model

# Move data to h2o (training only needed now)
fwrite(home_data[split == "train"], "../data/train_data_eng.csv")
train_h <- h2o.importFile("../data/train_data_eng.csv", destination_frame = "train_h")

home_df_rf <- h2o.randomForest(
  x = c(cont_features, disc_features),
  y = y,
  training_frame = train_h,
  model_id = "home_df_rf"
)

home_df_rf

h2o.varimp(home_df_rf) %>% 
  as.data.table %>% 
  .[,.(variable, scaled_importance = round(scaled_importance, 2), percentage = round(percentage, 2))]

Random Explorings (DO NOT INCLUDE IN FINAL REPORT)

# Have home prices changed over time?
home_data %>% 
  ggplot(aes(x = SalePrice)) +
  geom_histogram() +
  facet_grid(YrSold ~ MoSold) +
  theme_tufte()

home_data %>% 
  ggplot(aes(x = as.factor(YrSold), y = SalePrice)) +
  geom_boxplot() +
  theme_tufte()

# Slight downward trend in home prices but nothing crazy
# Time series plot
home_data[,.(Avg_Sale_Price = mean(SalePrice, na.rm = TRUE)), by = .(YrSold, MoSold)][order(YrSold, MoSold),Avg_Sale_Price] %>% 
  as.ts(frequency = 12) %>% 
  plot

home_data[,table(YrSold, MoSold, split)]

Feature Engineering

# House age at time of sale (YrSold - YearBuilt)

Data Modeling

Limited number of observations. Cross validation will be used to maximize training potential.

Kaggle Submission

---
title: "Intro to Data Vis HW 2"
output: html_notebook
---

```{r setup}
knitr::opts_chunk$set(fig.width = 10, fig.height = 8)

require(data.table)
require(magrittr)
require(tidyverse)
require(stringr)
require(ggthemes)
require(ggthemr)
require(GGally)
require(cowplot)
require(corrr)
require(VIM)
require(h2o)
require(h2oEnsemble)

# Initialize h2o
h2o.init()
```

```{r load data}
home_train <- fread("../data/train.csv")
home_test <- fread("../data/test.csv")  # Missing response - used for Kaggle submission

names(home_train)
```


## Introduction
This data set comes from [this](https://www.kaggle.com/c/house-prices-advanced-regression-techniques) kaggle competition. The data contains data on home sales in Ames, Iowa, from 2006 - 2010. There are `r ncol(home_train)` features in the data, including the price of the home when it sold (`SalePrice`). The object of the Kaggle competition is to accurately predict `SalePrice` for a predefined test set of homes. We will explore the data to gain understanding and look to expose any underlying patterns before modeling the data and attempting to climb into the top 10% of the Kaggle leaderboard.

## Exploratory Data Analysis
Before jumping into looking at the data, a few simple pre processing steps are necessary. Some of the numeric features in the data, like quality scores, are actually categorical in nature. We'll transform those features into factors and we'll also identify and separate discrete features from continuous features so they can be examined separately. We'll also combine the data so that we can make easily make comparisons between the test and the train set to check that feature distributions are approximately identical.

```{r pre processing}
# Combine data sets with distinction between train and test
home_train[,split := "train"]
home_test[,split := "test"]
home_test[,SalePrice := NA]

home_data <- rbind(home_train, home_test)

# Convert features into proper format based on data_description.txt
home_data[,c("MSSubClass",
             "OverallQual",
             "OverallCond") := list(
                as.factor(MSSubClass),
                as.factor(OverallQual),
                as.factor(OverallCond)
              )]

# Number of discrete and continuous features
sapply(home_data, class) %>% 
  table

# Define dependent variable
y <- "SalePrice"

# Setnames to be more program friendly
setnames(home_data, names(home_data)[str_detect(names(home_data), "^[0-9]")], paste0("h", names(home_data)[str_detect(names(home_data), "^[0-9]")]))

# Extract names of continuous features
cont_features <- names(home_data)[sapply(home_data, class) == "integer"]
cont_features <- setdiff(cont_features, c(y, "Id"))

# Extract names of discrete features
disc_features <- names(home_data)[sapply(home_data, class) %in% c("character", "factor")]
disc_features <- setdiff(disc_features, "split")
```

### Response Variable
Before jumping into examining the features, it is important to understand the response variable, in this case `SalePrice`. The following histogram illustrates the distribution of `SalePrice` in the data.

```{r SalePrice eda}
sp_hist <- home_data %>% 
  ggplot(aes(x = SalePrice)) +
  geom_histogram() +
  theme_minimal() +
  labs(title = "Histogram of Sale Price")

sp_hist
```

This histogram clearly indicates the data is somewhat right skewed. In an attempt to control for that, we'll look at what the distribution looks like after a log transformation and compare it with the original distribution in the following plot.

```{r log sp}
home_data[,log_SalePrice := log(SalePrice)]

# Histogram of log sale price
lsp_hist <- home_data %>% 
  ggplot(aes(x = log_SalePrice)) +
  geom_histogram() +
  theme_minimal() +
  labs(title = "Histogram of log(SalePrice)")

# Side by side comparison of origional distribution with log distribution
plot_grid(sp_hist, lsp_hist)

# Define y2 as log_SalePrice
y2 <- "log_SalePrice"
```

The log transformation appeared to help with the skewness of the data so the remainder of the analysis will use `log_SalePrice`.

### Missingness
Now, we want to check for missing values. The `VIM` package has some nice functions for visualizing missing values and so we'll use those. Essentially, we want to look for features with a high volumn of missing values and we also want to see if there are prevelant patterns in features that are often missing together. That could indicate a latent variable that isn't being measured but that we could account for with a dummy variable.

```{r missingness}
# Explore missingness of data
aggr(home_data[, -c(y, y2), with = FALSE], combined = TRUE)
```

There are quite a few columns dominated by missing values. In addition, there are a couple of combinations of missing values in observations that happen rather frequently, possibly an indication of some latent variable. In order to help machine learning models pick up on these similar observations, we'll add a `missingness` feature to the data to encode common missingness combinations.

```{r}
# Add missingness feature
home_data[,missingness := "other"]
home_data[is.na(Alley) & is.na(PoolQC) & is.na(Fence) & is.na(MiscVal), missingness := "one"]
home_data[is.na(Alley) & is.na(PoolQC) & is.na(Fence) & is.na(MiscVal) & is.na(FireplaceQu), missingness := "two"]
```

### Discrete Features
Now, we will perform a more in depth examination of the data by looking at discrete features first. We're interested in determining a few things. First, we want to see how many observations there are of each value for each discrete feature. Features that appear to sparse may need to be removed from the data. Second, we want to look at each level of each discrete feature and how it relates to `log_SalePrice` to determine which features provide strong signals for `log_SalePrice`. To address the first point, we will cerate bar plots for each discrete feature. To address the second point, we will look at boxplots for each of the same features.

```{r discrete features}
# Count of distinct values in each discrete feature
disc_bar <- home_data[,c(disc_features, "split"), with = FALSE] %>% 
  melt(id.vars = "split", measure.vars = disc_features) %>% 
  ggplot(aes(x = value, fill = split)) +
  geom_bar(na.rm = FALSE, position = "dodge") +
  facet_wrap(~variable, scales = "free") +
  theme_tufte() +
  theme(axis.text.y = element_blank(),
        axis.text.x = element_text(angle = 45)) +
  labs("Discrete Feature Barplots")

# Boxplot of response variable for discrete features
disc_box <- home_data[split == "train",c(disc_features, y2, "split"), with = FALSE] %>% 
  melt(id.vars = c("split", y2), measure.vars = disc_features) %>% 
  ggplot(aes(x = value, y = log_SalePrice)) +
  geom_boxplot() +
  facet_wrap(~variable, scales = "free") +
  theme_tufte() +
  theme(axis.text.y = element_blank(),
        axis.text.x = element_text(angle = 45)) +
  labs("Discrete Feature Boxplots")

plot_grid(disc_bar, disc_box, nrow = 2)
```

The above plots illustrate some important insights. First, `Street`, `Utlilities`, and `PoolQC` each appear to be rather sparse. However, `PoolQC` does seem to have a strong influence on `log_SalePrice`. There are several additional variables that seem to strongly indicate `logSalePrice`. Sepcifically, `OverallQual`, `OverallCond` have rather distinct patterns. To determine of these two variable's can be further exploited, we'll look at how they interact with one another. The following heatmaps illustrate the interaction between `OverallQual` and `OverallCond`. The heatmap on the left illustrates how frequently various combinations of these features occur together while the heatmap on the right illustrates the 

```{r qual interactions}
n_heat <- home_data[,.N, by = .(split, OverallQual, OverallCond)] %>% 
  ggplot(aes(x = OverallQual, y = OverallCond, fill = N)) +
  geom_raster() +
  theme_minimal() +
  labs(title = "Heatmap of OverallCond and OverallQual")

lsp_heat <- home_data[split == "train",.(avg_log_SalePrice = mean(log_SalePrice)), by = .(OverallCond, OverallQual)] %>% 
  ggplot(aes(x = OverallQual, y = OverallCond, fill = avg_log_SalePrice)) +
  geom_raster() +
  theme_minimal() +
  labs(title = "Heatmap of OverallCond and OverallQual and log_SalePrice")

plot_grid(n_heat, lsp_heat)
```

As can be seen from the heatmaps, the majority of homes are sold with `OverallQual` between 4 and 8 and `OverallCond` between 5 and 7. This pattern remains consistent across test and train data. As expected, average `log_SalePrice` increases as both `OveralQual` and `OverallCond` increase.

### Continuous Features
Now that we have at least some idea of what is happening with the discrete features, we'll take a look at what is happening with the continuous features. First, we'll look at a correlation plot of each continuous feature to determine the strength of relationship both among features and between featuers and `log_SalePrice`. 

```{r continuous features}
# Correlation matrix
house_corplot <- home_data[split == "train",c(cont_features, y2), with = FALSE] %>% 
  correlate %>% 
  rearrange() %>% 
  shave() %>% 
  rplot()
  
house_corplot +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
```


```{r}
# Scatter plot pairs
home_data[split == "train", c(cont_features, y), with = FALSE] %>% 
  cor

home_data$YearBuilt %>% summary
home_data[,decade_built := cut(YearBuilt, seq(1869, 2019, by = 10), labels = paste0(seq(1870, 2010, by = 10), "s"), include.lowest = TRUE)]
home_data[,.N, by = decade_built]
home_data[,length(unique(decade_built))]
length(seq(1870, 2010, by = 10))
home_data[,summary(YearBuilt), by = decade_built]


home_data[split == "train",.(avg_log_SalePrice = mean(log_SalePrice)), by = .(decade_built, Neighborhood)] %>% 
  ggplot(aes(x = decade_built, y = avg_log_SalePrice, col = Neighborhood, group = Neighborhood)) +
  geom_line() +
  theme_minimal()

home_data[,table(Neighborhood, decade_built)]

home_data[,.N, by = .(Neighborhood, decade_built)] %>% 
  ggplot(aes(x = decade_built, y = Neighborhood, fill = N)) +
  geom_raster() +
  theme_minimal()
```


The correlation plot indicates that several features dealing with the size of the home are strongly correlated with `log_SalePrice`. Many of these features are also correlated with one another. While this would be an issue with general linear models, the statistical methods we will use should handle collinearity well.

```{r}
treemap::treemap(home_data[split == "train", .(size_var = .N), by = .(decade_built, Neighborhood)], #Your data frame object
        index=c("Neighborhood", "decade_built"),  #A list of your categorical variables
        vSize = "size_var",  #This is your quantitative variable
        type="index", #Type sets the organization and color scheme of your treemap
        palette = "Set3",  #Select your color palette from the RColorBrewer presets or make your own.
        title="Sale Price by Neighborhood and Decade Built", #Customize your title
        fontsize.title = 14, #Change the font size of the title
        position.legend = 'none'
        )
```


```{r}
# Scatter plot pairs
home_data[split == "train", c(cont_features, y), with = FALSE] %>% 
  ggpairs()
```



### Quick Initial Model
```{r}
# Move data to h2o (training only needed now)
fwrite(home_data[split == "train"], "../data/train_data_eng.csv")
train_h <- h2o.importFile("../data/train_data_eng.csv", destination_frame = "train_h")

home_df_rf <- h2o.randomForest(
  x = c(cont_features, disc_features),
  y = y,
  training_frame = train_h,
  model_id = "home_df_rf"
)

home_df_rf

h2o.varimp(home_df_rf) %>% 
  as.data.table %>% 
  .[,.(variable, scaled_importance = round(scaled_importance, 2), percentage = round(percentage, 2))]
```

### Random Explorings (DO NOT INCLUDE IN FINAL REPORT)
```{r}
# Have home prices changed over time?
home_data %>% 
  ggplot(aes(x = SalePrice)) +
  geom_histogram() +
  facet_grid(YrSold ~ MoSold) +
  theme_tufte()

home_data %>% 
  ggplot(aes(x = as.factor(YrSold), y = SalePrice)) +
  geom_boxplot() +
  theme_tufte()

# Slight downward trend in home prices but nothing crazy
# Time series plot
home_data[,.(Avg_Sale_Price = mean(SalePrice, na.rm = TRUE)), by = .(YrSold, MoSold)][order(YrSold, MoSold),Avg_Sale_Price] %>% 
  as.ts(frequency = 12) %>% 
  plot

home_data[,table(YrSold, MoSold, split)]
```


## Feature Engineering
```{r}
# House age at time of sale (YrSold - YearBuilt)
```


## Data Modeling
Limited number of observations. Cross validation will be used to maximize training potential.
```{r h2o setup}

```

## Kaggle Submission

